# !/usr/bin/env python
# -*- coding:utf8 -*-

"""
#全国企业信用信息公示系统（广东）
"""

import sys
import traceback
import time
import re
from scpy.logger import get_logger
import requests
from bs4 import BeautifulSoup
from utils import kill_captcha
import copy
import json
import random
import sd_template_dict as TE
import sd_format as FO
import gd_trans_dict as TR
import table
import datetime
import gd_kill_js

# Shenzhen (szcredit.com.cn) requires a delay between requests; without it
# the site returns an error page instead of data.
SZ_SLEEP_TIME = 1
# Delay between requests to the Guangdong province-level site.
GD_SLEEP_TIME = 1

# Python 2 hack: force utf8 as the process-wide default encoding so mixed
# gbk/utf8 page content survives implicit str<->unicode coercion.
reload(sys)
sys.setdefaultencoding('utf8')

# Module-level logger named after this file.
logger = get_logger(__file__)


def exception_redo(try_time, default_return):
    """
    Decorator factory: retry the wrapped function up to ``try_time`` times.

    The first successful call's result is returned; if every attempt
    raises, the exception is swallowed and ``default_return`` is returned.

    :param try_time: maximum number of attempts (positive int)
    :param default_return: value returned after all attempts have failed
    :return: the configured decorator
    """

    def _func(func):
        def __func(*args, **kwargs):
            for _ in range(try_time):
                try:
                    # Return directly on first success.
                    return func(*args, **kwargs)
                except Exception:
                    # Swallow and retry; after the last attempt we fall
                    # through to the default below.
                    continue

            return default_return

        return __func

    return _func


def trans_time(raw_time):
    """
    Translate the site's US-style timestamp into ISO-ish format.

    eg:
        input: "Feb 20, 2009 12:00:00 AM"
        output: "2009-02-20 00:00:00"

    :param raw_time: e.g. "Feb 20, 2009 12:00:00 AM"; falsy values pass through as ''
    :return: "YYYY-MM-DD HH:MM:SS" string, or '' for empty input
    """
    if not raw_time:
        return ''
    # Use %I (12-hour clock): the previous %H made strptime ignore %p,
    # so "12:00:00 AM" was parsed as 12:00 instead of 00:00.
    return datetime.datetime.strptime(raw_time, "%b %d, %Y %I:%M:%S %p").strftime(
        "%Y-%m-%d %H:%M:%S")


# Desktop Chrome user agent so the sites serve the normal HTML pages.
UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"


# index_url = "http://gsxt.gdgs.gov.cn/"
# Module-level session shared by the captcha/search flow so cookies persist
# between the index, captcha and search requests.
req = requests.session()
req.headers = {"User-Agent": UserAgent}
# index_res = req.get(index_url, timeout=30)


def download_captcha_kill(companyName):
    """
    Crack the search captcha on gsxt.gdgs.gov.cn and look up a company.

    :param companyName: company name to search for
    :return: the first result URL on success, None when the site reports
             no matching record, '' when the captcha failed and the
             caller should retry
    """
    index_url = "http://gsxt.gdgs.gov.cn/"
    img_url = "http://gsxt.gdgs.gov.cn/aiccips/verify.html"
    check_url_1 = "http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/checkCode.html"
    check_url_2 = "http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html"

    # Prime the shared session with cookies, then fetch a fresh captcha.
    req.get(index_url, timeout=30)
    img_res = req.get(img_url + '?random=%s' % random.random(), timeout=30)

    captcha = kill_captcha(img_res.content, 'gd', 'png')
    logger.error('验证码为:%s' % captcha)
    captcha_is_bad = (not captcha) or len(captcha) > 100 or str(captcha) in ['None', 'wrong']
    if captcha_is_bad:
        logger.error('验证码为:%s' % captcha)
        logger.error("破解验证码的服务，出现异常,可能是下载的验证码错误，也可能破解服务出现异常")
        # Empty string signals the caller to retry the captcha.
        return ""

    check_res_1 = req.post(check_url_1, data={
        "textfield": companyName,
        "code": captcha, }, timeout=30)
    check_json = json.loads(check_res_1.content)

    if check_json.get("flag") != "1":
        logger.info(check_res_1.content)
        logger.error("验证码错误")
        return ""

    check_res_2 = req.post(check_url_2, data={
        "textfield": check_json.get("textfield"),
        "code": captcha, }, timeout=30)

    if '暂未查询到相关记录' in check_res_2.content:
        logger.info('暂未查询到相关记录')
        return None

    com_url = re.findall('''<div class="list">.*?href="(.*?)"''', check_res_2.content, re.S)
    if not com_url:
        logger.info('no url')
        return None

    logger.info(com_url[0])
    return com_url[0]


def get_company_info(com_info):
    """
    Dispatch a result URL to the downloader that handles its site flavour.

    :param com_info: URL returned by the search step
    :return: raw_dict produced by the matching downloader
    :raises Exception: for URL flavours without a downloader yet
    """
    # Shenzhen has its own dedicated site.
    if 'http://www.szcredit.com.cn/' in com_info:
        return get_company_info_1(com_info)

    # todo 2
    # elif 'http://121.8.226.101:7001/' in com_info:
    #     return get_company_info_2(com_info)

    # Relative URL: the province-level site (Guangdong minus Guangzhou/Shenzhen).
    if '../' in com_info:
        return get_company_info_3(com_info)

    # Guangzhou mirrors the province-level page layout.
    if 'http://gsxt.gzaic.gov.cn/' in com_info:
        return get_company_info_3(com_info)

    logger.info('New type URL !')
    raise Exception('U need to add new type function!')


def _decode_gbk_html(raw, from_encoding='gbk'):
    """
    Best-effort GBK -> UTF-8 conversion of a downloaded page.

    Falls back to re-serializing through BeautifulSoup when a straight
    decode fails (malformed pages).  Pass ``from_encoding=None`` for the
    annual-report pages, which mis-render when 'gbk' is forced.
    """
    try:
        return raw.decode('gbk').encode('utf8')
    except Exception:
        if from_encoding:
            return str(BeautifulSoup(raw, 'html5lib', from_encoding=from_encoding))
        return str(BeautifulSoup(raw, 'html5lib'))


def get_company_info_1(com_info):
    """
    Download a Shenzhen (szcredit.com.cn) company page set.

    Requests must be throttled (SZ_SLEEP_TIME) or the site answers with an
    error page instead of data.

    :param com_info: company detail URL
    :return: raw_dict holding the downloaded html fragments and annual reports
    """
    req.headers = {"User-Agent": UserAgent, "Referer": "http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html"}
    raw_dict = {
        "province": "gd",
        "type": "1",
        "html": {},
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }
    raw_html = {}

    # Base-information page.
    raw_base_res = _decode_gbk_html(req.get(com_info, timeout=30).content)
    raw_html['base'] = raw_base_res
    time.sleep(SZ_SLEEP_TIME)

    # Shareholder detail pages linked from the base page.
    share_detail_url_list = re.findall("'(EntSHDetail\.aspx\?rid=.*?)'", raw_base_res)
    raw_share_detail_list = []
    for a_share_detail_url in share_detail_url_list:
        time.sleep(SZ_SLEEP_TIME)
        a_share_detail_url = 'http://www.szcredit.com.cn/web/GSZJGSPT/' + a_share_detail_url
        raw_share_detail_list.append(_decode_gbk_html(req.get(a_share_detail_url, timeout=30).content))
    raw_html['share_detail'] = raw_share_detail_list
    time.sleep(SZ_SLEEP_TIME)

    # Alteration info comes from an ASP.NET partial postback; replay the
    # page's __VIEWSTATE tokens.  re.findall returns lists — post the
    # matched string itself, not the list (the old code sent the list).
    view_state = re.findall('id="__VIEWSTATE"value="(.+?)"', raw_base_res.replace(' ', ''))
    rator = re.findall('id="__VIEWSTATEGENERATOR"value="(.+?)"', raw_base_res.replace(' ', ''))
    alter_req_data = {
        "ScriptManager1": "biangengxinxi|Timer2",
        "__EVENTTARGET": "Timer2",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": view_state[0] if view_state else '',
        "__VIEWSTATEGENERATOR": rator[0] if rator else '',
        "__ASYNCPOST": "true",
        "": ""}
    raw_html['alter'] = _decode_gbk_html(req.post(com_info, data=alter_req_data, timeout=30).content)

    raw_dict['html'] = raw_html
    time.sleep(SZ_SLEEP_TIME)

    # Annual reports: index page, then one page per year.
    year_index_url = com_info.replace('xyDetail', 'nbDetail')
    year_index_res = _decode_gbk_html(req.get(year_index_url, timeout=30).content)
    year_url_s = re.findall("'(http://www\.szcredit\.com\.cn/web/GSZJGSPT/NBGSDetai\.aspx\?Entid=.*?)'", year_index_res)
    raw_year_list = []
    for a_year_url in year_url_s:
        time.sleep(SZ_SLEEP_TIME)
        year = re.findall('Year=(\d+)', a_year_url)
        raw_year_list.append({
            'year': year[0] if year else '',
            # from_encoding must be omitted for these pages (see helper).
            'year_base': _decode_gbk_html(req.get(a_year_url, timeout=30).content, from_encoding=None),
        })

    raw_dict['yearList'] = raw_year_list

    return raw_dict


def get_company_info_2(com_info):
    """
    Placeholder for a third URL flavour (http://121.8.226.101:7001/).

    No such URL has been observed in the wild yet, so this always returns
    None.  When implemented, it should produce a raw_dict shaped like the
    other downloaders with "province": "gd" and "type": "2".

    :param com_info: company detail URL (currently unused)
    :return: None
    """
    # TODO: implement once a type-2 URL shows up.
    return None


def _post_or_empty(session, url, data):
    """
    POST ``data`` to ``url`` after the mandatory GD request delay.

    Any failure (timeout, connection error, ...) yields '' so the caller
    can treat the section as absent; only real exceptions are swallowed,
    not KeyboardInterrupt/SystemExit as the old bare excepts did.
    """
    time.sleep(GD_SLEEP_TIME)
    try:
        return session.post(url, data=data, timeout=30).content
    except Exception:
        return ''


def get_company_info_3(com_info):
    """
    Download a province-level page set (Guangdong incl. Dongguan, Zhuhai,
    Shantou; also compatible with Guangzhou's mirror site).

    Handles the HTTP-521 JavaScript cookie challenge via gd_kill_js, then
    fetches the base page plus the JSON paging endpoints and annual reports.

    :param com_info: relative ('../...') or absolute gzaic URL
    :return: raw_dict holding html fragments, JSON payloads and annual reports
    :raises Exception: for URLs of neither supported flavour
    """
    req = requests.session()
    req.headers = {"User-Agent": UserAgent}
    raw_dict = {
        "province": "gd",
        "type": "3",
        "html": {},
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }
    raw_base_dict = {}
    if '../' in com_info:
        base_url = com_info.replace('../', 'http://gsxt.gdgs.gov.cn/aiccips/')
        root_url = 'http://gsxt.gdgs.gov.cn/aiccips/'
    elif 'http://gsxt.gzaic.gov.cn/aiccips/' in com_info:
        base_url = com_info
        root_url = 'http://gsxt.gzaic.gov.cn/aiccips/'
    else:
        raise Exception()

    # Anti-crawler challenge: a 521 response carries JS that computes the
    # __jsl_clearance cookie; solve it and retry with the cookie attached.
    time.sleep(GD_SLEEP_TIME)
    index_cookie_res = req.get(base_url, timeout=30)
    if index_cookie_res.status_code == 521:
        index_cookie_dict = index_cookie_res.cookies.get_dict()
        js_cookie = gd_kill_js.gd_kill_cookie(index_cookie_res.content)
        req.headers = {
            "User-Agent": UserAgent,
            "Cookie": "__jsluid=%s;" % index_cookie_dict.get('__jsluid', '') + js_cookie,
            "Referer": base_url,
        }

    time.sleep(GD_SLEEP_TIME)
    raw_base_res = req.get(base_url, timeout=30).content

    # The base page embeds the identifiers every follow-up request needs.
    ent_no = re.findall('name="entNo" value="(.+?)"', raw_base_res)
    ent_type = re.findall('id="entType" name="entType" value="(.+?)"', raw_base_res)
    reg_org = re.findall('id="regOrg" name="regOrg" value="(.+?)"', raw_base_res)
    if not ent_no and not ent_type and not reg_org:
        # Nothing to page through; return the bare base page.
        raw_base_dict = {'base': raw_base_res, 'ba': '', 'abnormal': '', 'check_message': '', 'share_detail_list': []}
        raw_dict['html'] = raw_base_dict
        return raw_dict

    # Plain-page requests.
    req_data = {"entNo": ent_no[0], "entType": ent_type[0], "regOrg": reg_org[0]}
    # JSON paging requests; pageNo=1 returns the full list in one shot.
    req_json_data = {'pageNo': '1', 'entNo': ent_no[0], 'regOrg': reg_org[0], }

    # Shareholders (JSON paging).
    raw_base_dict['share'] = _post_or_empty(req, root_url + 'GSpublicity/invInfoPage.html', req_json_data)

    # Alterations (JSON paging).
    req_alter_json_data = {'pageNo': '2', 'entNo': ent_no[0], 'regOrg': reg_org[0], 'entType': ent_type[0]}
    raw_base_dict['alter'] = _post_or_empty(req, root_url + 'GSpublicity/entChaPage.html', req_alter_json_data)

    # Filing record — intentionally not fetched any more.
    raw_base_dict['ba'] = ''

    # Key personnel (JSON paging).
    raw_base_dict['person'] = _post_or_empty(req, root_url + 'GSpublicity/vipInfoPage', req_json_data)

    # Branches (JSON paging).
    raw_base_dict['branch'] = _post_or_empty(req, root_url + 'GSpublicity/braInfoPage', req_json_data)

    # Abnormal-operation records (HTML).
    raw_base_dict['abnormal'] = _post_or_empty(
        req, root_url + 'GSpublicity/GSpublicityList.html?service=cipUnuDirInfo', req_data)

    # Spot-check records (HTML).
    raw_base_dict['check_message'] = _post_or_empty(
        req, root_url + 'GSpublicity/GSpublicityList.html?service=cipSpotCheInfo', req_data)

    # Annual reports: index page, then one page per report year.
    year_index_res = _post_or_empty(req, root_url + 'BusinessAnnals/BusinessAnnalsList.html', req_data)

    if year_index_res:
        year_url_list = re.findall('href="(http://gsxt\.gdgs\.gov\.cn/aiccips/BusinessAnnals/view\.html.*?)"',
                                   year_index_res, re.S)
        year_url_list = year_url_list or re.findall(
            'href="(http://gsxt\.gzaic\.gov\.cn/aiccips/BusinessAnnals/view\.html.*?)"', year_index_res, re.S)
    else:
        year_url_list = []
    raw_year_list = []
    for a_year_url in year_url_list:
        year = re.findall('reportYear=(\d+)&', a_year_url)
        time.sleep(GD_SLEEP_TIME)
        a_year_res = req.get(a_year_url, timeout=30).content
        raw_year_list.append({
            "year": year[0] if year else '',
            "year_base": a_year_res,
        })

    raw_base_dict['base'] = raw_base_res
    # Shareholder detail pages are redundant: the JSON payload already
    # contains the details.
    raw_base_dict['share_detail'] = []

    raw_dict['html'] = raw_base_dict
    raw_dict['yearList'] = raw_year_list

    return raw_dict


def extract_base_info(raw_dict):
    """
    Route a raw_dict to the base-info parser matching its 'type' field.

    :param raw_dict: output of one of the get_company_info_* downloaders
    :return: normalized base-info dict
    :raises Exception: for unknown types (including the unimplemented '2')
    """
    parse_type = raw_dict.get('type', '')
    if parse_type == '1':
        return extract_base_info_1(raw_dict)
    if parse_type == '3':
        return extract_base_info_3(raw_dict)
    raise Exception("extract_base_info error")


def extract_base_info_1(raw_dict):
    """
    Parse a type-1 (Shenzhen) raw_dict into the normalized base dict.

    Every section lives in the single base-page HTML; each one is located
    by trying its possible table captions in order.

    :param raw_dict: output of get_company_info_1
    :return: normalized base-info dict
    :raises Exception: when raw_dict or its base page is unusable
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    raw_html = raw_dict.get("html", {})
    if not raw_html:
        raise Exception("raw_dict 错误")

    raw_base = raw_html.get("base")
    raw_base_table = table.table_clean(raw_base, "基本信息")
    if not raw_base_table:
        raise Exception("基本信息错误")
    res_base_dict = copy.deepcopy(TE.void_base_dict)
    res_base_dict["basicList"] = table.index("基本信息", raw_base_table)
    res_base_dict["province"] = "gd"

    # 股东详情 合并 todo

    # (result key, caption passed to table.index, candidate table captions)
    sections = (
        ("shareHolderList", "股东信息", ("股东信息", "投资人信息", "股东（发起人）信息")),
        ("alterList", "变更信息", ("变更信息",)),
        ("personList", "主要人员信息", ("主要人员信息", "家庭成员信息")),
        ("filiationList", "分支机构信息", ("分支机构信息",)),
        ("liquidationList", "清算信息", ("清算信息",)),
        ("abnormalOperation", "经营异常信息", ("经营异常信息", "经营异常")),
        ("checkMessage", "抽查检查信息", ("抽查检查信息",)),
    )
    for res_key, index_caption, candidates in sections:
        cleaned = None
        for a_caption in candidates:
            cleaned = table.table_clean(raw_base, a_caption)
            if cleaned:
                break
        res_base_dict[res_key] = table.index(index_caption, cleaned) if cleaned else []

    return res_base_dict


def extract_base_info_3(raw_dict):
    """
    Parse a type-3 (province-site) raw_dict into the normalized base dict.

    Shareholders, alterations, personnel and branches come back as JSON
    from the paging endpoints; abnormal-operation and spot-check data are
    HTML tables.

    :param raw_dict: output of get_company_info_3
    :return: normalized base-info dict
    :raises Exception: when raw_dict or its base page is unusable
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    raw_html = raw_dict.get("html", {})
    if not raw_html:
        raise Exception("raw_dict 错误")

    raw_base = raw_html.get("base")
    raw_base_table = table.table_clean(raw_base, "基本信息")
    if not raw_base_table:
        raise Exception("基本信息错误")
    res_base_dict = copy.deepcopy(TE.void_base_dict)
    res_base_dict["basicList"] = table.index("基本信息", raw_base_table)
    res_base_dict["province"] = "gd"

    # JSON sections: (result key, raw_html key, template dict,
    #                 translation dict, date field needing trans_time or None)
    json_sections = (
        ("shareHolderList", "share", TE.shareHolder_dict, TR.shareHolder_dict, "acConDate"),
        ("alterList", "alter", TE.alter_dict, TR.alter_dict, "altDate"),
        ("personList", "person", TE.person_dict, TR.person_dict, None),
        ("filiationList", "branch", TE.filiation_dict, TR.filiation_dict, None),
    )
    for res_key, html_key, template, trans, date_field in json_sections:
        raw_part = raw_html.get(html_key, {}) or {}
        item_list = json.loads(raw_part).get('list', []) if raw_part else []
        parsed = []
        for item in item_list:
            if not item:
                continue
            if date_field:
                # Normalize the US-style timestamp before mapping fields.
                item[date_field] = trans_time(item.get(date_field, '') or '')
            parsed.append(FO.transform_dict(template, trans, item))
        res_base_dict[res_key] = parsed

    # HTML-table sections.
    raw_abnormal_html = raw_html.get("abnormal", "")
    res_base_dict["abnormalOperation"] = table.index("经营异常信息", raw_abnormal_html) if raw_abnormal_html else []

    raw_check_message_html = raw_html.get("check_message", "")
    res_base_dict["checkMessage"] = table.index("抽查检查信息", raw_check_message_html) if raw_check_message_html else []

    return res_base_dict


def extract_year_info(raw_dict):
    """
    Route a raw_dict to the annual-report parser for its 'type'.

    Both supported types share the same report layout, so they use the
    same parser.

    :param raw_dict: output of one of the get_company_info_* downloaders
    :return: list of per-year report dicts
    :raises Exception: for unknown types
    """
    parse_type = raw_dict.get('type', '')
    if parse_type in ('1', '3'):
        return extract_year_info_3(raw_dict)
    raise Exception("extract_year_info error")


def extract_year_info_3(raw_dict):
    """
    Parse the downloaded annual-report pages into per-year dicts.

    Each section is located by trying its possible table captions in
    order; sections whose report_index is known to choke on odd layouts
    are guarded and default to [].

    :param raw_dict: raw_dict with a 'yearList' of {'year', 'year_base'} items
    :return: None for an empty raw_dict, [] when there are no reports,
             otherwise a list of parsed report dicts
    """
    if not raw_dict:
        return None
    raw_year_json_list = raw_dict.get("yearList", [])
    if not raw_year_json_list:
        return []

    # (result key, caption for report_index, candidate captions,
    #  default value, guard report_index with try/except?)
    sections = (
        ('baseInfo', '企业基本信息', ('企业基本信息', "基本信息", '登记事项基本信息'), {}, False),
        ('website', '网站或网店信息', ('网站或网店信息',), {}, False),
        ('investorInformations', '股东及出资信息', ('发起人及出资信息', '股东及出资信息'), [], True),
        ('assetsInfo', '企业资产状况信息', ('企业资产状况信息', '资产状况信息'), {}, False),
        ('equityChangeInformations', '股权变更信息', ('股权变更信息',), [], True),
        ('changeRecords', '修改记录', ('修改记录',), [], False),
        ('entinvItemList', '对外投资信息', ('对外投资信息', '对外投资', '对外投资情况'), [], True),
    )

    res_year_list = []
    for a_year_item in raw_year_json_list:
        raw_year_base = a_year_item.get("year_base", "")
        # Strip span tags, which break table extraction.
        raw_year_base = re.sub('<span.*?>', '', raw_year_base).replace('</span>', '') if raw_year_base else ''

        res_year_dict = {'year': a_year_item.get("year", "")}
        for res_key, index_caption, candidates, default, guarded in sections:
            cleaned = None
            for a_caption in candidates:
                cleaned = table.table_clean(raw_year_base, a_caption)
                if cleaned:
                    break
            if guarded:
                try:
                    res_year_dict[res_key] = table.report_index(index_caption, cleaned) if cleaned else default
                except:
                    res_year_dict[res_key] = default
            else:
                res_year_dict[res_key] = table.report_index(index_caption, cleaned) if cleaned else default

        res_year_list.append(res_year_dict)

    return res_year_list


def search2(companyName, MAXTIME=20):
    """
    Full crawl pipeline for one company: crack the captcha, locate the
    company URL, download its pages and parse them.

    :param companyName: company name to search for
    :param MAXTIME: max captcha-cracking attempts (also the download retry count)
    :return: None when the company does not exist, otherwise a tuple
             (raw_dict, parsed_dict_or_None, gate_method)
    :raises Exception: when the captcha cannot be cracked within MAXTIME tries
    """
    res = ''
    a_time = MAXTIME
    while a_time > 0:
        if res is None:  # site reported no such company
            return None
        elif res == '':  # captcha failed -> retry
            if a_time < MAXTIME:
                logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, a_time))
            a_time -= 1
            try:
                res = download_captcha_kill(companyName)
            except Exception as e:
                # print_exc() takes no exception argument; the old call
                # passed `e` as the traceback *limit* by mistake.
                traceback.print_exc()
                logger.info(e)
                time.sleep(2)
                continue
        else:
            break

    if a_time <= 1 and res == '':
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)
    else:
        com_list = res
        # Retry the download on transient HTTP errors.
        for _ in range(MAXTIME):
            try:
                res = get_company_info(com_list)
                break
            except requests.HTTPError:
                continue

        # NOTE(review): if every download attempt raised HTTPError, `res`
        # is still the URL string here and the assignments below fail.
        raw_dict = res
        try:
            asic_dict = extract_base_info(raw_dict)
            year_list = extract_year_info(raw_dict)
            company_name = asic_dict['basicList'][0].get('enterpriseName', '')
            company_name = company_name if company_name else companyName
            res['companyName'] = company_name

            asic_dict['yearReportList'] = year_list
            gate_method = {
                'url': '',
                'method': 'get',
                'province': 'gd',
                'companyName': company_name,
                'data': com_list,
            }

            return res, asic_dict, gate_method

        except Exception as e:
            # Parsing failed: still return the raw download for inspection.
            logger.info(e)
            res['companyName'] = companyName
            gate_method = {
                'url': '',
                'method': 'get',
                'province': 'gd',
                'companyName': companyName,
                'data': com_list,
            }
            return res, None, gate_method


def search(companyName):
    """
    Convenience wrapper: run the full pipeline and keep only the parsed dict.

    :param companyName: company name to search for
    :return: parsed base-info dict, or None when the company is not found
    """
    result = search2(companyName)
    return result[1] if result else None


def search3(gate_method):
    """
    Re-crawl a company from a previously returned gate_method descriptor.

    :param gate_method: dict that must contain 'data' (the company URL),
                        optionally 'companyName'
    :return: (raw_dict, parsed_dict_or_None, gate_method)
    :raises Exception: when gate_method has no 'data' key
    """
    if 'data' not in gate_method:
        raise Exception("gate_method error, doesn't have `data` key")
    com_list = gate_method.get('data')
    res = get_company_info(com_list)
    companyName = gate_method.get('companyName', '')

    raw_dict = res
    new_gate = {
        'url': '',
        'method': 'get',
        'province': 'gd',
        'companyName': companyName,
        'data': com_list,
    }
    try:
        asic_dict = extract_base_info(raw_dict)
        year_list = extract_year_info(raw_dict)
        company_name = asic_dict['basicList'][0].get('enterpriseName', '') or companyName
        res['companyName'] = company_name
        asic_dict['yearReportList'] = year_list
        new_gate['companyName'] = company_name
        return res, asic_dict, new_gate

    except Exception as e:
        # Parsing failed: still hand back the raw download.
        logger.info(e)
        res['companyName'] = companyName
        new_gate['companyName'] = companyName
        return res, None, new_gate


if __name__ == "__main__":
    # Manual test entry: uncomment one of the sample company names below
    # and run the module. Names are grouped by which site flavour serves
    # them and which quirk they exercise.
    # companyName = '广东电网揭阳揭东供电局'
    # companyName = '广东新南方集团有限公司'
    # companyName = '东莞市锐国五金制品有限公司'

    # companyName = '广州丽滋摩塔贸易有限公司'
    # companyName = '广州申奥新能源科技有限公司'

    """
    第一类
    """
    # Type 1: Shenzhen site http://www.szcredit.com.cn/
    # business-scope field needs special handling
    # companyName = '广东新南方集团深圳投资有限公司'
    #
    # companyName = '深圳惠盐高速公路有限公司'
    # companyName = '深圳市盘古天地投资管理集团'
    # companyName = '惠州亿纬锂能股份有限公司'
    # companyName = '汕尾德昌电子有限公司深圳赛格高新电子市场经销部'
    # companyName = '潮州三环集团股份有限公司深圳分公司'
    # many alteration records and many personnel entries
    # companyName = '深圳市雄韬电源科技股份有限公司'

    # very many investors
    # companyName = '深圳市兆驰投资有限公司'

    # very many alteration records
    # companyName = '腾讯科技（深圳）有限公司'

    # has abnormal-operation records
    # companyName = '深圳市银轩高速公路工程有限公司'

    """
    第二、三类
    """
    # Types 2/3: province site http://gsxt.gdgs.gov.cn/aiccips/GSpublicity/GSpublicityList.html
    # companyName = '广东彩艳股份有限公司'
    # companyName = '广州汽车集团股份有限公司汽车工程研究院'
    # companyName = '广州腾讯科技有限公司'
    # companyName = '湛江通用电气股份有限公司'
    # companyName = '广东肇庆星湖生物科技股份有限公司广州分公司'
    # companyName = '惠州亿纬锂能股份有限公司'
    # companyName = '梅州市客都实业有限公司'
    # companyName = '广东嘉应制药股份有限公司'
    # companyName = '汕尾德昌电子有限公司'
    # companyName = '河源市大华股份有限公司'
    # companyName = '清远通业股份有限公司'
    # companyName = '揭阳国泰实业股份有限公司'
    # companyName = '广东省云浮万通建材集团股份有限公司'

    # shareholder, personnel and branch pagination
    # companyName = '广东风华高新科技股份有限公司'
    # companyName = '广州高澜节能技术股份有限公司'
    # companyName = '湛江国联水产开发股份有限公司'
    # companyName = '广东肇庆星湖生物科技股份有限公司'

    # alteration and branch pagination
    # companyName = '广东新宝电器股份有限公司'

    # alteration and personnel pagination
    # companyName = '广东韶钢松山股份有限公司'
    # companyName = '中山联合光电科技股份有限公司'

    # shareholder pagination, has branches
    # companyName = '珠海派诺科技股份有限公司'

    # shareholder, alteration and personnel pagination
    # companyName = '广东德联集团股份有限公司'
    # companyName = '珠海市魅族科技有限公司'
    # companyName = '中山大洋电机股份有限公司'

    # shareholder and alteration pagination
    # companyName = '广州华工百川科技有限公司'

    # personnel pagination
    # companyName = '广东明珠集团股份有限公司'
    # companyName = '广州汽车集团股份有限公司'
    # companyName = '广州发展集团股份有限公司'
    # companyName = '广船国际有限公司'
    # companyName = '珠海港股份有限公司'
    # companyName = '珠海格力电器股份有限公司'
    # companyName = '广东汕头超声电子股份有限公司'
    # companyName = '茂名石化实华股份有限公司'
    # companyName = '清远金穗股份有限公司'
    # companyName = '东莞宜安科技股份有限公司'
    # companyName = '潮州华丰集团股份有限公司'

    # shareholder and personnel pagination
    # companyName = '广东世荣兆业股份有限公司'
    # companyName = '佛山市蓝箭电子股份有限公司'
    # companyName = '茂名市粤能电力股份有限公司'
    # companyName = '河源富马硬质合金股份有限公司'
    # many shareholders (14 pages)
    # companyName = '中山达华智能科技股份有限公司'

    # shareholder pagination
    # companyName = '汕头鳗联股份有限公司'
    # companyName = '江门市蒙德电气股份有限公司'
    # companyName = '广东湛江吉民药业股份有限公司'
    # companyName = '惠州沃典科技股份有限公司'
    # companyName = '阳江龙达集团股份有限公司'
    # companyName = '中山沃尔夫机电股份有限公司'

    # spot-check records
    # companyName = '佛山市万丽斯顿机械设备有限公司'
    # companyName = '佛山市嘉丽华化妆品有限公司'
    # companyName = '江海区林氏按摩店'

    # abnormal-operation records
    # companyName = '东莞市王记贸易有限公司'
    # companyName = '东莞市汇嫣服饰有限公司'


    # companyName = '深圳市炬鑫源塑胶五金有限公司'
    companyName = '深圳市民航集团新产业控股有限公司'

    # Run the full pipeline and dump the (raw, parsed, gate) result.
    res = search2(companyName)
    print json.dumps(res, indent=4, ensure_ascii=False)

    # import pymongo
    # import json
    # clientServer = pymongo.MongoClient('192.168.31.121',27017)
    # db = clientServer.crawler_company_name
    # collectionServer = db.companyName
    # reg_no_s = collectionServer.find({'province': 'gd'}).skip(150).batch_size(10)
    # for reg in reg_no_s:
    #     print '#'*10
    #     print reg
    #     print '#'*10
    #     companyName = reg['companyName']
    #     print companyName
    #     try_time = 20
    #     com_url = ""
    #     while try_time > 0 and com_url == "":
    #         try:
    #             com_url = download_captcha_kill(companyName)
    #             try_time -= 1
    #         except Exception, e:
    #             try_time -= 1
    #             logger.exception(e)
    #
